## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 555039 29.7 1242597 66.4 686457 36.7
## Vcells 1017255 7.8 8388608 64.0 1876641 14.4
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)
`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')note: some duplicated ids in PSS
fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]
fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
# Remove NA and empty strings
x = x[!is.na(x) & x != ""]
paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]
fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]
pss_long = melt(
pss,
id.vars = c("name", "all_pathways", 'short_name'), # Columns to keep as is
measure.vars = patterns("^id_"), # Columns to melt (all starting with "id_")
variable.name = "id_num", # Name for the melted variable column
value.name = "id" # Name for the melted value column
)
pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))##
## FALSE TRUE
## 816 24
pss_long %>%
dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
dplyr::arrange(id) %>%
print()## all_pathways short_name id
## <char> <char> <char>
## 1: Hormone - Ethylene (ET) ORA59 AT1G06160
## 2: Hormone - Ethylene (ET) ERF/ORA59 AT1G06160
## 3: Hormone - Salicylic acid (SA) TCP8 AT1G58100
## 4: Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
## 5: Hormone - Ethylene (ET) EDF2 AT1G68840
## 6: Hormone - Ethylene (ET) ERF/EDF AT1G68840
## 7: Signalling - Heat-shock proteins (HSPs),Stress - Heat HSP70 AT3G12580
## 8: Signalling - Heat-shock proteins (HSPs) HSP AT3G12580
## 9: Hormone - Ethylene (ET) ERF1 AT3G23240
## 10: Hormone - Ethylene (ET) ERF/EDF AT3G23240
## 11: Hormone - Ethylene (ET) ERF6 AT4G17490
## 12: Hormone - Ethylene (ET) ERF/ORA59 AT4G17490
## 13: Hormone - Ethylene (ET) ERF1 AT4G17500
## 14: Hormone - Ethylene (ET) ERF/EDF AT4G17500
## 15: Signalling - Heat-shock proteins (HSPs) MED37E AT5G02500
## 16: Signalling - Heat-shock proteins (HSPs) HSP AT5G02500
## 17: Hormone - Ethylene (ET) ERF096 AT5G43410
## 18: Hormone - Ethylene (ET) ERF/EDF AT5G43410
## 19: Hormone - Ethylene (ET) ERF5 AT5G47230
## 20: Hormone - Ethylene (ET) ERF/ORA59 AT5G47230
## 21: Hormone - Ethylene (ET) ERF105 AT5G51190
## 22: Hormone - Ethylene (ET) ERF/ORA59 AT5G51190
## 23: Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24: Signalling - Heat-shock proteins (HSPs) HSP AT5G59720
## 25: Hormone - Ethylene (ET) ERF104 AT5G61600
## 26: Hormone - Ethylene (ET) ERF/EDF AT5G61600
## all_pathways short_name id
pss_long = pss_long %>%
dplyr::filter(stringr::str_starts(id, "AT")) %>%
dplyr::group_by(id) %>%
dplyr::summarise(
dplyr::across(
.cols = dplyr::everything(),
.fns = ~ {
vals = unique(na.omit(.))
if (length(vals) > 1) paste(vals, collapse = " | ")
else if (length(vals) == 1) vals
else NA_character_
}
),
.groups = "drop"
)
pss_long[pss_long == ""] = "-"
gn[is.na(gn)] = "-"
sn[is.na(sn)] = "-"
ath.annot = pss_long %>%
dplyr::full_join(gn, by = c("id" = "V1")) %>%
dplyr::full_join(sn, by = c("id" = "V1"))Note: 35.2 bin matcheswill be ignored
| Plant Name | Label | JCVI-MCScan | Compara Plants | Plaza | OrthoDB | FastOma | RBH | Mercator |
|---|---|---|---|---|---|---|---|---|
| Malus domestica | apple | mdo_GDv1 | malus_domestica_golden | mdo | mdo | mdo | mdo | mdo |
| mdo_HChap1 | ||||||||
| Prunus persica | ppe | ppe | prunus_persica | ppe | ppe | pper | ppe | pper |
| Prunus dulcis / P. amygdalus | almond | almond | prunus_dulcis | pdul | pdul | pdul | pdul | |
| Prunus avium | wild cherry | wildcherry | prunus_avium | pavi | pavi | pavi | pavi | |
| Prunus armeniaca | apricot | apricot | parm | parm | parm | parm | ||
| Prunus cerasifera | cherry plum | cherryplum | pcer | pcer | pcer | |||
| Pyrus | pear | pear | pcox | pcox | pcox | |||
| Prunus sibirica | Siberian apricot | siberianapricot | psib | psib | psib | psib |
params_list <- list(
plantName = 'mdo'
, plantNameOut = "apple"
, plantDirOut = file.path('..', 'reports', group, "apple")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000002603af3d5e8>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-10] | |…… | 11% | |…….. | 15% [unnamed-chunk-11] | |……… | 19% | |……….. | 22% [unnamed-chunk-12] | |…………. | 26% | |…………… | 30% [unnamed-chunk-13] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-14] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-15] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-16] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-17] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-18] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-19] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-20] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-21] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-22] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 20997 77128 32036 56069 30068 37682
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT4G10330 ath Maldo.hc.v1a1.ch10A.g00003 mdo FastOMA
## 2 AT4G10340 ath Maldo.hc.v1a1.ch10A.g00004 mdo FastOMA
## 3 AT5G44410 ath g1 mdo FastOMA
## 4 AT5G44440 ath g1 mdo FastOMA
## 5 AT1G03770 ath Maldo.hc.v1a1.ch10A.g02825 mdo MCScanX
## 6 AT1G03800 ath Maldo.hc.v1a1.ch10A.g02815 mdo MCScanX
## 7 ATMG00910 ath Maldo.hc.v1a1.sc45A.g49878 mdo MCScanX
## 8 ATMG01020 ath Maldo.hc.v1a1.sc45A.g49883 mdo MCScanX
## 9 AT2G46320 ath Maldo.hc.v1a1.ch1A.g26266 mdo OrthoDB
## 10 AT4G27940 ath Maldo.hc.v1a1.ch1A.g26266 mdo OrthoDB
## 11 AT2G07695 ath Maldo.hc.v1a1.sc164A.g48922 mdo OrthoDB
## 12 AT2G07695 ath Maldo.hc.v1a1.sc119A.g48697 mdo OrthoDB
## 13 AT2G28190 ath Maldo.hc.v1a1.ch6A.g38979 mdo PLAZA
## 14 AT2G07732 ath Maldo.hc.v1a1.sc90A.g49976 mdo PLAZA
## 15 AT5G17400 ath Maldo.hc.v1a1.ch9A.g48118 mdo PLAZA
## 16 AT2G47300 ath Maldo.hc.v1a1.ch17A.g24110 mdo PLAZA
## 17 AT1G01020 ath Maldo.hc.v1a1.ch7A.g41990 mdo RBH
## 18 AT1G01030 ath Maldo.hc.v1a1.ch1A.g25187 mdo RBH
## 19 ATMG01410 ath Maldo.hc.v1a1.sc36A.g49760 mdo RBH
## 20 ATMG01410 ath Maldo.hc.v1a1.sc71A.g49908 mdo RBH
## 21 AT4G39400 ath Maldo.hc.v1a1.ch2A.g27501 mdo compara
## 22 AT4G39400 ath Maldo.hc.v1a1.ch15A.g17036 mdo compara
## 23 AT1G65880 ath Maldo.hc.v1a1.ch17A.g24066 mdo compara
## 24 AT5G52820 ath Maldo.hc.v1a1.ch17A.g24067 mdo compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1962915 104.9 3293471 175.9 3125948 167.0
## Vcells 19558200 149.3 31783875 242.5 31746152 242.3
## [1] 23046
## [1] 34410
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 20997 77128 32036 56069 30068 37682
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 117481 27
## [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 7350008 392.6 13692148 731.3 13504928 721.3
## Vcells 125473339 957.3 195447237 1491.2 195431551 1491.1
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 23180
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 23046
## # mdo unigue genes: 34410
## # ath highest connection degree: 128
## # mdo highest connection degree: 116
## # genes in ath with degree > 30: 319
## # genes in mdo with degree > 30: 288
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 72085 23180 13328 8915
##
## 100% match bad MapMan no match partial match
## 1 36847 10783 10916 7960
## 2 9865 3217 1663 576
## 3 6786 2076 473 178
## 4 6739 2218 150 104
## 5 7352 2776 96 69
## 6 4496 2110 30 28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 117508
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 30593 11555 2976 1019
##
## 100% match bad MapMan no match partial match
## 1 2523 1870 1615 345
## 2 5377 1399 709 352
## 3 5006 1479 383 135
## 4 6010 2003 144 95
## 5 7181 2694 95 64
## 6 4496 2110 30 28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19719
## # mdo unigue genes: 29104
## # ath highest connection degree: 44
## # mdo highest connection degree: 45
## # genes in ath with degree > 30: 7
## # genes in mdo with degree > 30: 4
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 32036
## 2: compara 5278
## 3: PLAZA 6568
## 4: OrthoDB_FastOMA_RBH_MapMan4 444
## 5: FastOMA_OrthoDB_MapMan4 822
## 6: OrthoDB_RBH_MapMan4 485
## 7: FastOMA_RBH_MapMan4 510
## 8: reject 71365
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 163 61
## FastOMA_RBH_MapMan4 MCScanX
## 27 1336
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 23 25
## PLAZA reject
## 264 2046
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'ppe'
, plantNameOut = "peach"
, plantDirOut = file.path('..', 'reports', group, "peach")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000260c8320e08>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-38] | |…… | 11% | |…….. | 15% [unnamed-chunk-39] | |……… | 19% | |……….. | 22% [unnamed-chunk-40] | |…………. | 26% | |…………… | 30% [unnamed-chunk-41] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-42] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-43] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-44] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-45] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-46] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-47] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-48] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-49] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-50] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 16193 44006 17894 38370 20791 24564
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath Prupe.1G000500 ppe FastOMA
## 2 AT1G62440 ath Prupe.1G000500 ppe FastOMA
## 3 AT1G61010 ath Prupe.I006100 ppe FastOMA
## 4 AT1G61010 ath Prupe.I006200 ppe FastOMA
## 5 AT1G04230 ath Prupe.1G027200 ppe MCScanX
## 6 AT1G04240 ath Prupe.1G027500 ppe MCScanX
## 7 ATCG00530 ath Prupe.7G029500 ppe MCScanX
## 8 ATCG00680 ath Prupe.7G029800 ppe MCScanX
## 9 AT3G17900 ath Prupe.1G267800 ppe OrthoDB
## 10 AT4G35230 ath Prupe.1G355500 ppe OrthoDB
## 11 AT2G07675 ath Prupe.6G146800 ppe OrthoDB
## 12 ATMG00980 ath Prupe.6G146800 ppe OrthoDB
## 13 AT3G02890 ath Prupe.2G329000 ppe PLAZA
## 14 AT5G16680 ath Prupe.2G329000 ppe PLAZA
## 15 AT4G03170 ath Prupe.4G260600 ppe PLAZA
## 16 AT4G03160 ath Prupe.4G260600 ppe PLAZA
## 17 AT1G01030 ath Prupe.5G134900 ppe RBH
## 18 AT1G01040 ath Prupe.2G200900 ppe RBH
## 19 ATMG01250 ath Prupe.6G123900 ppe RBH
## 20 ATMG01250 ath Prupe.7G164000 ppe RBH
## 21 AT5G01010 ath Prupe.6G300700 ppe compara
## 22 AT5G01020 ath Prupe.6G300300 ppe compara
## 23 AT1G80940 ath Prupe.3G103600 ppe compara
## 24 AT1G80950 ath Prupe.1G382400 ppe compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4886860 261.0 10953719 585.0 13692148 731.3
## Vcells 116113180 885.9 195447237 1491.2 195446431 1491.2
## [1] 23113
## [1] 21245
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 16193 44006 17894 38370 20791 24564
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 71222 231
## [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
## [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
## [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
## [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
## [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
## [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5562657 297.1 10953719 585.0 13692148 731.3
## Vcells 90359306 689.4 195447237 1491.2 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 16428
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 23113
## # ppe unigue genes: 21245
## # ath highest connection degree: 57
## # ppe highest connection degree: 115
## # genes in ath with degree > 30: 264
## # genes in ppe with degree > 30: 135
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 43931 16428 6360 4734
##
## 100% match bad MapMan no match partial match
## 1 21126 7929 5410 4119
## 2 6460 2272 560 368
## 3 4106 1255 185 103
## 4 4182 1339 103 68
## 5 4892 1998 68 51
## 6 3165 1635 34 25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 71453
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 19587 7852 1432 657
##
## 100% match bad MapMan no match partial match
## 1 1873 1402 853 215
## 2 3467 929 260 222
## 3 2797 808 125 81
## 4 3607 1168 92 63
## 5 4678 1910 68 51
## 6 3165 1635 34 25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19239
## # ppe unigue genes: 18731
## # ath highest connection degree: 40
## # ppe highest connection degree: 19
## # genes in ath with degree > 30: 2
## # genes in ppe with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 17894
## 2: compara 5084
## 3: PLAZA 4926
## 4: OrthoDB_FastOMA_RBH_MapMan4 317
## 5: FastOMA_OrthoDB_MapMan4 731
## 6: OrthoDB_RBH_MapMan4 359
## 7: FastOMA_RBH_MapMan4 217
## 8: reject 41925
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 145 34
## FastOMA_RBH_MapMan4 MCScanX
## 11 728
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 25 16
## PLAZA reject
## 166 1269
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pdul'
, plantNameOut = "almond"
, plantDirOut = file.path('..', 'reports', group, "almond")
, flag = 2
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000260c64465b0>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-66] | |…… | 11% | |…….. | 15% [unnamed-chunk-67] | |……… | 19% | |……….. | 22% [unnamed-chunk-68] | |…………. | 26% | |…………… | 30% [unnamed-chunk-69] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-70] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-71] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-72] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-73] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-74] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-75] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-76] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-77] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-78] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 15975 42927 20148 35734 24829
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G05620 ath TexasF1_G1000 pdul FastOMA
## 2 AT3G48880 ath TexasF1_G10001 pdul FastOMA
## 3 AT3G48890 ath TexasF1_G9999 pdul FastOMA
## 4 AT5G52240 ath TexasF1_G9999 pdul FastOMA
## 5 AT1G04220 ath TexasF1_G767 pdul MCScanX
## 6 AT1G04230 ath TexasF1_G779 pdul MCScanX
## 7 ATCG00170 ath TexasF1_G22620 pdul MCScanX
## 8 ATCG00500 ath TexasF1_G22624 pdul MCScanX
## 9 AT1G23390 ath TexasF1_G3359 pdul OrthoDB
## 10 AT5G19210 ath TexasF1_G2060 pdul OrthoDB
## 11 AT3G51810 ath TexasF1_G23162 pdul OrthoDB
## 12 AT2G28815 ath TexasF1_G6420 pdul OrthoDB
## 13 AT1G01030 ath TexasF1_G18833 pdul RBH
## 14 AT1G01040 ath TexasF1_G9057 pdul RBH
## 15 ATMG00860 ath TexasF1_G25095 pdul RBH
## 16 ATMG01250 ath TexasF1_G22012 pdul RBH
## 17 AT4G37540 ath TexasF1_G22582 pdul compara
## 18 AT5G67420 ath TexasF1_G22582 pdul compara
## 19 AT4G15960 ath TexasF1_G27715 pdul compara
## 20 AT3G45140 ath TexasF1_G6796 pdul compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4014451 214.4 8762976 468 13692148 731.3
## Vcells 83929985 640.4 156357790 1193 195447236 1491.2
## [1] 22451
## [1] 20903
##
## compara FastOMA MCScanX OrthoDB RBH
## 15975 42927 20148 35734 24829
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "all_pathways" "short_name" "athName"
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 67030
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3898023 208.2 8762976 468 13692148 731.3
## Vcells 57539806 439.0 156357790 1193 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 14567
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22451
## # pdul unigue genes: 20903
## # ath highest connection degree: 150
## # pdul highest connection degree: 113
## # genes in ath with degree > 30: 171
## # genes in pdul with degree > 30: 120
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 39774 14567 7625 5064
##
## 100% match bad MapMan no match partial match
## 1 19498 7186 6154 4134
## 2 6174 1991 848 468
## 3 4138 1446 289 176
## 4 4954 1821 187 146
## 5 5010 2123 147 140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 67030
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 17566 6240 1707 974
##
## 100% match bad MapMan no match partial match
## 1 1171 694 690 253
## 2 3490 699 461 301
## 3 3215 994 230 144
## 4 4680 1730 179 136
## 5 5010 2123 147 140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 18482
## # pdul unigue genes: 17119
## # ath highest connection degree: 22
## # pdul highest connection degree: 18
## # genes in ath with degree > 30: 0
## # genes in pdul with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 20148
## 2: compara 4234
## 3: OrthoDB_FastOMA_RBH_MapMan4 510
## 4: FastOMA_OrthoDB_MapMan4 697
## 5: OrthoDB_RBH_MapMan4 474
## 6: FastOMA_RBH_MapMan4 424
## 7: reject 40543
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 118 47
## FastOMA_RBH_MapMan4 MCScanX
## 18 843
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 42 17
## reject
## 1320
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pavi'
, plantNameOut = "wildcherry"
, plantDirOut = file.path('..', 'reports', group, "wildcherry")
, flag = 2
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000260c45bc740>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-94] | |…… | 11% | |……. | 15% [unnamed-chunk-95] | |……… | 19% | |……….. | 22% [unnamed-chunk-96] | |…………. | 26% | |…………… | 30% [unnamed-chunk-97] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-98] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-99] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-100] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-101] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-102] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-103] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-104] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-105] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-106] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 4367 45924 19708 38228 25594
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath FUN_000050 pavi FastOMA
## 2 AT1G62440 ath FUN_000050 pavi FastOMA
## 3 AT2G43840 ath FUN_040335 pavi FastOMA
## 4 AT2G44050 ath FUN_040336 pavi FastOMA
## 5 AT1G04220 ath FUN_000300 pavi MCScanX
## 6 AT1G04230 ath FUN_000316 pavi MCScanX
## 7 ATMG01190 ath FUN_026738 pavi MCScanX
## 8 ATMG00910 ath FUN_040205 pavi MCScanX
## 9 AT4G39370 ath FUN_020728 pavi OrthoDB
## 10 AT3G06350 ath FUN_020749 pavi OrthoDB
## 11 AT4G24220 ath FUN_029917 pavi OrthoDB
## 12 AT4G24220 ath FUN_029968 pavi OrthoDB
## 13 AT1G01030 ath FUN_025493 pavi RBH
## 14 AT1G01040 ath FUN_011748 pavi RBH
## 15 ATMG01250 ath FUN_040221 pavi RBH
## 16 ATMG01360 ath FUN_026804 pavi RBH
## 17 AT2G22690 ath FUN_021390 pavi compara
## 18 AT4G37880 ath FUN_021390 pavi compara
## 19 AT4G39770 ath FUN_021012 pavi compara
## 20 AT4G21740 ath FUN_032538 pavi compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3137040 167.6 8762977 468 13692148 731.3
## Vcells 53926859 411.5 156357790 1193 195447236 1491.2
## [1] 22167
## [1] 21950
##
## compara FastOMA MCScanX OrthoDB RBH
## 4367 45924 19708 38228 25594
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "all_pathways" "short_name" "athName"
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 71643 2
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3957200 211.4 8762977 468 13692148 731.3
## Vcells 58643052 447.5 156357790 1193 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 15130
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22167
## # pavi unigue genes: 21950
## # ath highest connection degree: 122
## # pavi highest connection degree: 115
## # genes in ath with degree > 30: 242
## # genes in pavi with degree > 30: 131
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 41733 15130 8890 5892
##
## 100% match bad MapMan no match partial match
## 1 20986 7805 7489 4881
## 2 7358 2253 946 572
## 3 6191 2096 298 243
## 4 6023 2400 131 163
## 5 1175 576 26 33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 71645
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 18503 5202 1518 1055
##
## 100% match bad MapMan no match partial match
## 1 1074 513 732 249
## 2 4956 670 415 395
## 3 5345 1075 216 215
## 4 5953 2368 129 163
## 5 1175 576 26 33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 17405
## # pavi unigue genes: 16539
## # ath highest connection degree: 31
## # pavi highest connection degree: 16
## # genes in ath with degree > 30: 2
## # genes in pavi with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 19708
## 2: compara 1309
## 3: OrthoDB_FastOMA_RBH_MapMan4 2158
## 4: FastOMA_OrthoDB_MapMan4 1457
## 5: OrthoDB_RBH_MapMan4 923
## 6: FastOMA_RBH_MapMan4 723
## 7: reject 45367
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 47 65
## FastOMA_RBH_MapMan4 MCScanX
## 34 789
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 77 54
## reject
## 1683
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'parm'
, plantNameOut = "apricot"
, plantDirOut = file.path('..', 'reports', group, "apricot")
, flag = 3
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000260b994a690>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-122] | |…… | 11% | |……. | 15% [unnamed-chunk-123] | |……… | 19% | |……….. | 22% [unnamed-chunk-124] | |…………. | 26% | |…………… | 30% [unnamed-chunk-125] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-126] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-127] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-128] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-129] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-130] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-131] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-132] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-133] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-134] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 43038 18615 52084 25259
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 16 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath PruarM.1G000500 parm FastOMA
## 2 AT1G62440 ath PruarM.1G000500 parm FastOMA
## 3 AT2G47410 ath PruarM.8G368500 parm FastOMA
## 4 AT4G02060 ath PruarM.8G368700 parm FastOMA
## 5 AT1G04400 ath PruarM.1G051900 parm MCScanX
## 6 AT1G04410 ath PruarM.1G052500 parm MCScanX
## 7 AT5G64410 ath PruarM.8G146300 parm MCScanX
## 8 AT5G64410 ath PruarM.8G146200 parm MCScanX
## 9 AT5G10270 ath PruarM.1G279700 parm OrthoDB
## 10 AT5G64960 ath PruarM.1G279700 parm OrthoDB
## 11 AT2G15790 ath PruarM.8G195500 parm OrthoDB
## 12 AT4G34660 ath PruarM.8G163400 parm OrthoDB
## 13 AT1G01010 ath PruarM.2G368400 parm RBH
## 14 AT1G01030 ath PruarM.5G193300 parm RBH
## 15 ATMG01360 ath PruarM.4G189900 parm RBH
## 16 ATMG01360 ath PruarM.4G190100 parm RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3114878 166.4 8762977 468 13692148 731.3
## Vcells 52632599 401.6 156357790 1193 195447236 1491.2
## [1] 22351
## [1] 22551
##
## FastOMA MCScanX OrthoDB RBH
## 43038 18615 52084 25259
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "from_count" "to_count"
## [9] "count_evidence" "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION"
## [13] "all_pathways" "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 84042
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3298180 176.2 8762977 468 13692148 731.3
## Vcells 46553594 355.2 156357790 1193 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 27414
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22351
## # parm unigue genes: 22551
## # ath highest connection degree: 267
## # parm highest connection degree: 113
## # genes in ath with degree > 30: 344
## # genes in parm with degree > 30: 392
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 40370 27414 10212 6046
##
## 100% match bad MapMan no match partial match
## 1 20994 20423 8961 4950
## 2 7132 2400 896 600
## 3 6352 2247 227 306
## 4 5892 2344 128 190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 84042
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 17465 4442 1221 1153
##
## 100% match bad MapMan no match partial match
## 1 1147 545 606 253
## 2 4818 534 342 438
## 3 5608 1019 145 272
## 4 5892 2344 128 190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 16773
## # parm unigue genes: 15078
## # ath highest connection degree: 29
## # parm highest connection degree: 19
## # genes in ath with degree > 30: 0
## # genes in parm with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 18615
## 2: OrthoDB_FastOMA_RBH_MapMan4 2608
## 3: FastOMA_OrthoDB_MapMan4 1415
## 4: OrthoDB_RBH_MapMan4 893
## 5: FastOMA_RBH_MapMan4 750
## 6: reject 59761
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_OrthoDB_MapMan4 FastOMA_RBH_MapMan4
## 91 26
## MCScanX OrthoDB_FastOMA_RBH_MapMan4
## 768 115
## OrthoDB_RBH_MapMan4 reject
## 53 1822
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pcox'
, plantNameOut = "pear"
, plantDirOut = file.path('..', 'reports', group, "pear")
, flag = 4
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000260bf88ac80>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-150] | |…… | 11% | |……. | 15% [unnamed-chunk-151] | |……… | 19% | |……….. | 22% [unnamed-chunk-152] | |…………. | 26% | |…………… | 30% [unnamed-chunk-153] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-154] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-155] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-156] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-157] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-158] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-159] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-160] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-161] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-162] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 75969 33983 36090
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G20520 ath Pyrco.da.v2a1.augustus.000230 pcox FastOMA
## 2 AT1G76210 ath Pyrco.da.v2a1.augustus.000230 pcox FastOMA
## 3 AT5G53090 ath Pyrco.da.v2a1.snap.445350 pcox FastOMA
## 4 AT5G53100 ath Pyrco.da.v2a1.snap.445350 pcox FastOMA
## 5 AT1G03900 ath Pyrco.da.v2a1.chr10A.089110 pcox MCScanX
## 6 AT1G03920 ath Pyrco.da.v2a1.chr10A.089090 pcox MCScanX
## 7 AT5G56870 ath Pyrco.da.v2a1.chr9A.225970 pcox MCScanX
## 8 AT5G57035 ath Pyrco.da.v2a1.chr9A.225390 pcox MCScanX
## 9 AT1G01030 ath Pyrco.da.v2a1.chr14A.371380 pcox RBH
## 10 AT1G01030 ath Pyrco.da.v2a1.chr1A.345960 pcox RBH
## 11 ATMG01250 ath Pyrco.da.v2a1.chr5A.045340 pcox RBH
## 12 ATMG01250 ath Pyrco.da.v2a1.snap.153710 pcox RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2738729 146.3 8762977 468.0 13692148 731.3
## Vcells 39962793 304.9 125086232 954.4 195447236 1491.2
## [1] 21603
## [1] 30838
##
## FastOMA MCScanX RBH
## 75969 33983 36090
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "all_pathways"
## [13] "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 95885
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2935517 156.8 8762977 468.0 13692148 731.3
## Vcells 40502642 309.1 125086232 954.4 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 16546
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 21603
## # pcox unigue genes: 30838
## # ath highest connection degree: 136
## # pcox highest connection degree: 116
## # genes in ath with degree > 30: 261
## # genes in pcox with degree > 30: 287
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 60208 16546 10595 8536
##
## 100% match bad MapMan no match partial match
## 1 36081 8117 9410 7873
## 2 12902 4306 975 468
## 3 11225 4123 210 195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 95885
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 25400 8415 3269 994
##
## 100% match bad MapMan no match partial match
## 1 3024 1653 2355 395
## 2 11151 2639 704 404
## 3 11225 4123 210 195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 17474
## # pcox unigue genes: 24456
## # ath highest connection degree: 23
## # pcox highest connection degree: 20
## # genes in ath with degree > 30: 0
## # genes in pcox with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 33983
## 2: FastOMA_RBH_MapMan4 4095
## 3: reject 57807
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_RBH_MapMan4 MCScanX reject
## 161 1459 1684
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pcer'
, plantNameOut = "cherryplum"
, plantDirOut = file.path('..', 'reports', group, "cherryplum")
, flag = 4
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000260d6e47900>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-178] | |…… | 11% | |……. | 15% [unnamed-chunk-179] | |……… | 19% | |……….. | 22% [unnamed-chunk-180] | |…………. | 26% | |…………… | 30% [unnamed-chunk-181] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-182] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-183] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-184] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-185] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-186] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-187] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-188] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-189] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-190] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 162100 63358 80487
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G32760 ath Pcer_000001-RA pcer FastOMA
## 2 AT1G07920 ath Pcer_000002-RA pcer FastOMA
## 3 AT1G16650 ath Pcer_097557-RA pcer FastOMA
## 4 AT1G53530 ath Pcer_097558-RA pcer FastOMA
## 5 AT1G04220 ath Pcer_000137-RA pcer MCScanX
## 6 AT1G04230 ath Pcer_000150-RA pcer MCScanX
## 7 ATMG00080 ath Pcer_091446-RA pcer MCScanX
## 8 ATMG00513 ath Pcer_091469-RA pcer MCScanX
## 9 AT1G01030 ath Pcer_027461-RA pcer RBH
## 10 AT1G01030 ath Pcer_038773-RA pcer RBH
## 11 ATMG01330 ath Pcer_091451-RA pcer RBH
## 12 ATMG01360 ath Pcer_096779-RA pcer RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2648330 141.5 8762977 468.0 13692148 731.3
## Vcells 38924177 297.0 125086232 954.4 195447236 1491.2
## [1] 22197
## [1] 71437
##
## FastOMA MCScanX RBH
## 162100 63358 80487
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "all_pathways"
## [13] "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 201293 3
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3451314 184.4 8762977 468.0 13692148 731.3
## Vcells 56116124 428.2 125234665 955.5 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 40662
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22197
## # pcer unigue genes: 71437
## # ath highest connection degree: 270
## # pcer highest connection degree: 113
## # genes in ath with degree > 30: 890
## # genes in pcer with degree > 30: 449
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 126211 40662 19225 15198
##
## 100% match bad MapMan no match partial match
## 1 75114 21956 16707 13585
## 2 29613 10371 1995 1240
## 3 21484 8335 523 373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 201296
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 54173 15829 3833 2176
##
## 100% match bad MapMan no match partial match
## 1 6674 2600 2064 701
## 2 26015 4894 1246 1102
## 3 21484 8335 523 373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 17856
## # pcer unigue genes: 50879
## # ath highest connection degree: 64
## # pcer highest connection degree: 20
## # genes in ath with degree > 30: 30
## # genes in pcer with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 63358
## 2: FastOMA_RBH_MapMan4 12653
## 3: reject 125285
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_RBH_MapMan4 MCScanX reject
## 547 2541 4379
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'psib'
, plantNameOut = "siberianapricot"
, plantDirOut = file.path('..', 'reports', group, "siberianapricot")
, flag = 3
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000026078b404d8>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-206] | |…… | 11% | |……. | 15% [unnamed-chunk-207] | |……… | 19% | |……….. | 22% [unnamed-chunk-208] | |…………. | 26% | |…………… | 30% [unnamed-chunk-209] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-210] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-211] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-212] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-213] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-214] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-215] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-216] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-217] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-218] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 40732 16398 20889 25288
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 16 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath PaF106G0100000005 psib FastOMA
## 2 AT1G62440 ath PaF106G0100000005 psib FastOMA
## 3 AT3G07140 ath PaF106G0800032954 psib FastOMA
## 4 AT3G07140 ath PaF106G0800032956 psib FastOMA
## 5 AT1G04230 ath PaF106G0100000358 psib MCScanX
## 6 AT1G04240 ath PaF106G0100000361 psib MCScanX
## 7 ATCG00340 ath PaF106G0700028636 psib MCScanX
## 8 ATCG00350 ath PaF106G0700028635 psib MCScanX
## 9 AT2G43200 ath PaF106G0500020125 psib OrthoDB
## 10 AT1G52770 ath PaF106G0300013722 psib OrthoDB
## 11 AT3G56950 ath PaF106G0700028984 psib OrthoDB
## 12 AT5G37530 ath PaF106G0300012640 psib OrthoDB
## 13 AT1G01030 ath PaF106G0500020091 psib RBH
## 14 AT1G01040 ath PaF106G0200009357 psib RBH
## 15 ATMG01190 ath PaF106G0600023358 psib RBH
## 16 ATMG01250 ath PaF106G0700028671 psib RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2825261 150.9 7010383 374.4 13692148 731.3
## Vcells 48771093 372.1 100187732 764.4 195447236 1491.2
## [1] 22040
## [1] 20342
##
## FastOMA MCScanX OrthoDB RBH
## 40732 16398 20889 25288
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "from_count" "to_count"
## [9] "count_evidence" "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION"
## [13] "all_pathways" "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 58143
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3065509 163.8 7010383 374.4 13692148 731.3
## Vcells 40880778 311.9 100187732 764.4 195447236 1491.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 12813
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22040
## # psib unigue genes: 20342
## # ath highest connection degree: 58
## # psib highest connection degree: 115
## # genes in ath with degree > 30: 135
## # genes in psib with degree > 30: 106
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 35534 12813 6004 3792
##
## 100% match bad MapMan no match partial match
## 1 18977 6640 5025 3323
## 2 6361 2039 641 312
## 3 6065 2253 249 97
## 4 4131 1881 89 60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 58143
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 15378 4403 1448 545
##
## 100% match bad MapMan no match partial match
## 1 1154 859 878 174
## 2 4582 627 312 219
## 3 5511 1036 169 92
## 4 4131 1881 89 60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 15980
## # psib unigue genes: 14649
## # ath highest connection degree: 22
## # psib highest connection degree: 15
## # genes in ath with degree > 30: 0
## # genes in psib with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 16398
## 2: OrthoDB_FastOMA_RBH_MapMan4 2598
## 3: FastOMA_OrthoDB_MapMan4 731
## 4: OrthoDB_RBH_MapMan4 595
## 5: FastOMA_RBH_MapMan4 1452
## 6: reject 36369
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_OrthoDB_MapMan4 FastOMA_RBH_MapMan4
## 37 64
## MCScanX OrthoDB_FastOMA_RBH_MapMan4
## 674 93
## OrthoDB_RBH_MapMan4 reject
## 33 1259
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## time zone: Europe/Ljubljana
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2 knitr_1.50 data.table_1.17.0
## [5] magrittr_2.0.3
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3 dplyr_1.1.4
## [5] compiler_4.4.1 zip_2.3.2 Rcpp_1.0.14 tidyselect_1.2.1
## [9] stringr_1.5.1 dichromat_2.0-0.1 jquerylib_0.1.4 textshaping_1.0.1
## [13] systemfonts_1.2.3 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [17] R6_2.6.1 labeling_0.4.3 generics_0.1.4 patchwork_1.3.0
## [21] igraph_2.1.4 openxlsx_4.2.8 tibble_3.2.1 bslib_0.9.0
## [25] pillar_1.10.2 RColorBrewer_1.1-3 rlang_1.1.5 utf8_1.2.5
## [29] cachem_1.1.0 stringi_1.8.7 xfun_0.52 sass_0.4.10
## [33] cli_3.6.3 withr_3.0.2 digest_0.6.37 grid_4.4.1
## [37] rstudioapi_0.17.1 lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.3
## [41] glue_1.8.0 farver_2.1.2 ragg_1.4.0 colorspace_2.1-1
## [45] rmarkdown_2.29 tools_4.4.1 pkgconfig_2.0.3 htmltools_0.5.8.1